library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(expm)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'expm'
## The following object is masked from 'package:Matrix':
##
## expm
library(ggplot2)
library(lmvar)
library(leaps)
library(RColorBrewer)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
perfume = read.csv("noon_perfumes_dataset.csv")
sum(is.na(perfume))
## [1] 0
head(perfume)
no empty value. good.
perfume = perfume %>%
mutate(scent = ifelse(scents == "Arabian", "Oriental", scents))
p1 = subset(perfume, scent != "Vanilla" & scent != "Aromatic" & scent != "Musk" & scent != "Jasmine" & scent != "Floral and Oriental" & scent != "Rose, Floral" & scent != "Sandalwood" & scent != "Woody, Sweet" & scent != "Aromatic,Citrus" & scent != "Clean" & scent != "Oriental, Floral" & scent != "Sweet Aromatic" & scent != "Woody And Spicy" & scent != "Woody, Musky")
p2 = p1 %>%
mutate(conc = ifelse(concentration == "PDT", "EDT", concentration))
p2 = subset(p2, select = -c(concentration))
p3 = p2 %>%
mutate(brands1 = ifelse(brand == "ST Dupont", "S.T.Dupont", brand)) %>%
mutate(brands2 = ifelse(brands1 == "armani", "GIORGIO ARMANI", brands1)) %>%
mutate(brands3 = ifelse(brands2 == "Genie Collection", "Genie", brands2)) %>%
mutate(brands4 = ifelse(brands3 == "LANVIN PARIS", "LANVIN", brands3)) %>%
mutate(brands5 = ifelse(brands4 == "Mont Blanc", "MONTBLANC", brands4)) %>%
mutate(brands6 = ifelse(brands5 == "marbert man", "Marbert", brands5)) %>%
mutate(brands = ifelse(brands6 == "YSL" | brands6 == "YVES", "Yves Saint Laurent", brands6))
p3 = subset(p3, select = -c(brand, brands1, brands2, brands3, brands4, brands5, brands6))
p4 = subset(p3, seller_rating <= 5.0)
p5 = p4 %>%
mutate(num_sel_ratings =
ifelse(grepl("K", num_seller_ratings),
as.numeric(substring(num_seller_ratings, 1, nchar(num_seller_ratings) - 1)) * 1000,
as.numeric(num_seller_ratings)))
## Warning in ifelse(grepl("K", num_seller_ratings),
## as.numeric(substring(num_seller_ratings, : NAs introduced by coercion
p5 = subset(p5, select = -c(num_seller_ratings))
# clean seller column
seller = as.vector(p5$seller)
seller = tolower(seller)
index_golden = which(grepl("golden", seller))
seller[index_golden] = "golden perfumes"
index_lolita = which(grepl("lolita", seller))
seller[index_lolita] = "lolita shop"
index_noon = which(grepl("noon", seller))
seller[index_noon] = "noon"
index_swiss = which(grepl("swiss", seller))
seller[index_swiss] = "swiss arabian perfumes"
index_pa = which(grepl("perfumes--addresses", seller))
seller[index_pa] = "perfumes"
index_ps = which(grepl("perfumes-shop", seller))
seller[index_ps] = "perfumes"
p6 = p5
p6$seller = seller
sb = c(48, 435, 651)
bf = c(109, 121, 470, 565, 576)
p6 = p6 %>%
mutate(seller1 = ifelse(is.element(X, sb), "show biz", seller)) %>%
mutate(sellers = ifelse(is.element(X, bf), "beauty fortune", seller))
p6 = subset(p6, select = -c(seller1, seller))
p6 = p6 %>%
filter(conc != "EDC")
base_note = as.vector(p6$base_note)
base_note = tolower(base_note)
base_note = str_replace_all(base_note, " and ", ",")
base_note = str_replace_all(base_note, " ", "")
base_note = str_replace_all(base_note, "vanille", "vanilla")
base_note = str_replace_all(base_note, "woodsynotes", "wood")
base_note = str_replace_all(base_note, "orrisroot", "orris")
base_note = str_replace_all(base_note, "woodsynote", "wood")
base_note = str_replace_all(base_note, "woodynotes", "wood")
base_note = str_replace_all(base_note, "woody", "wood")
base_note = str_replace_all(base_note, "cedarwood", "cedar")
base_note = str_replace_all(base_note, "virginiacedar", "cedar")
base_note = str_replace_all(base_note, "whitemusk", "musk")
base_note = str_replace_all(base_note, "tonkabeans", "tonka")
base_note = str_replace_all(base_note, "tonkabean", "tonka")
base_note = str_replace_all(base_note, "amberwood", "amber")
base_note = str_replace_all(base_note, "sandalwood", "sandal")
base_note = str_replace_all(base_note, "cashmerewood", "cashmere")
base_note = str_replace_all(base_note, "guaiacwood", "guaiac")
base_note = str_replace_all(base_note, "ambergris", "AMBERGRIS")
base_note = str_replace_all(base_note, "mustyoud", "oud")
base_note = str_replace_all(base_note, "naturaloudoil", "oud")
base_note = str_replace_all(base_note, "agarwood\\(oud\\)", "oud")
base_note = str_replace_all(base_note, "agarwood", "oud")
base_note = str_replace_all(base_note, "oudh", "oud")
p6$base_note = base_note
mid_note = as.vector(p6$middle_note)
mid_note = tolower(mid_note)
mid_note = str_replace_all(mid_note, " and ", ",")
mid_note = str_replace_all(mid_note, " ", "")
mid_note = str_replace_all(mid_note, "lily-of-the-valley", "lily")
mid_note = str_replace_all(mid_note, "orrisroot", "orris")
mid_note = str_replace_all(mid_note, "lilyofthevalley", "lily")
mid_note = str_replace_all(mid_note, "bulgarianrose", "rose")
mid_note = str_replace_all(mid_note, "africanorangeflower", "orangeblossom")
mid_note = str_replace_all(mid_note, "neroli", "orangeblossom")
mid_note = str_replace_all(mid_note, "jasminesambac", "jasmine")
mid_note = str_replace_all(mid_note, "wildjasmine", "jasmine")
mid_note = str_replace_all(mid_note, "wildjasmine", "jasmine")
mid_note = str_replace_all(mid_note, "blackpepper", "pepper")
mid_note = str_replace_all(mid_note, "pinkpepper", "pepper")
mid_note = str_replace_all(mid_note, "vanille", "vanilla")
mid_note = str_replace_all(mid_note, "tuberose", "TUBEROSE")
mid_note = str_replace_all(mid_note, "orrisroot", "ORRISROOT")
mid_note = str_replace_all(mid_note, "honeysuckle", "HONEYSUCKLE")
mid_note = str_replace_all(mid_note, "rosemary", "ROSEMARY")
mid_note = str_replace_all(mid_note, "violetleaf", "VIOLETLEAF")
mid_note = str_replace_all(mid_note, "clarysage", "CLARYSAGE")
mid_note = str_replace_all(mid_note, "oudh", "oud")
mid_note = str_replace_all(mid_note, "burningoud", "oud")
mid_note = str_replace_all(mid_note, "agarwood\\(oud\\)", "oud")
mid_note = str_replace_all(mid_note, "agarwood", "oud")
mid_note = str_replace_all(mid_note, "oudwood", "oud")
p6$middle_note = mid_note
p7 = p6 %>%
filter(ml > 5)
#
# # add ordinal version of ml
# vol = as.vector(p7$ml)
# unique_vol = as.data.frame(vol) %>%
# group_by(vol) %>%
# summarise(count = n()) %>%
# subset(select = vol)
# unique_vol = as.vector(unique_vol$vol)
#
# order = vol
# rank = 0
# for (i in unique_vol) {
# rank = rank + 1
# index = which(vol == i)
# order[index] = rank
# }
# p7$ml_order = order
# p7 = subset(p7, select = -c(ml))
p7 = p7 %>%
mutate(gender = ifelse(department == "Kids Unisex", "Unisex", department)) %>%
filter(middle_note != "shavingsoap")
perfume = subset(p7, select = -c(department, X, name, scents))
perfume = unique(perfume)
brand = as.vector(p7$brands)
brand = tolower(brand)
new_brands = as.data.frame(brand) %>%
group_by(brand) %>%
summarise(count = n()) %>%
arrange(desc(count))
big_brands = new_brands[which(new_brands$count > 10), ]$brand
perfume = perfume %>%
mutate(big_brand = ifelse(is.element(tolower(brands), big_brands), 1, 0))
perfume = subset(perfume, select = -c(brands))
perfume = perfume %>%
mutate(is_noon = ifelse(tolower(sellers) == 'noon', 1, 0))
perfume = subset(perfume, select = -c(sellers))
get_notes = function(base, middle) {
bnote = as.vector(unlist(strsplit(base, split = ",")))
mnote = as.vector(unlist(strsplit(middle, split = ",")))
return(union(bnote, mnote))
}
complexity = function(notes) {
return(length(notes))
}
luxury = function(notes) {
score = 0
for (i in 1:length(notes)) {
if (notes[i] == "musk" | notes[i] == "orris") { # 100-200
score = score + 1
} else if (notes[i] == "neroli" | notes[i] == "jasmine" | notes[i] == "sandal") { # 200-400
score = score + 2
} else if (notes[i] == "rose" | notes[i] == "tuberose") { # 400-800
score = score + 3
} else if (notes[i] == "AMBERGRIS") { # 800-1200
score = score + 4
} else if (notes[i] == "oud") { # 1200-1600
score = score + 5
} else {
score = score + 0
}
}
return(score)
}
N = nrow(perfume)
complex = lux = rep(0, N)
for (i in 1:N) {
complex[i] = complexity(get_notes(perfume[i, ]$base_note, perfume[i, ]$middle_note))
lux[i] = luxury(get_notes(perfume[i, ]$base_note, perfume[i, ]$middle_note))
}
comp_score = lux_score = rep(0, N)
for (i in 1:N) {
x = complex[i]
comp_score[i] = sum(complex <= x) / N * 100
y = lux[i]
lux_score[i] = sum(lux <= y) / N * 100
}
perfume = perfume %>%
mutate(comp = complex)
# (comp_score * lux_score) / 100
rse = function(model) {
sqrt(sum(model$residuals ^ 2) / model$df.residual)
}
r2 = function(model) {
summary(model)$adj.r.squared
}
mse = function(model) {
mean(model$residuals ^ 2)
}
ge = function(model) {
n = nobs(model)
ge = 2 * (rse(model) ^ 2) * length(model$coefficients) / n
return(ge)
}
Cp.lm = function(mdl.list) {
n = nobs(mdl.list[[1]])
DoFs = sapply(mdl.list, function(mdl) { sum(hatvalues(mdl)) })
MSEs = sapply(mdl.list, function(mdl) { mean(residuals(mdl)^2) })
biggest = which.max(DoFs)
sigma2.hat = MSEs[[biggest]]*n/(n-DoFs[[biggest]])
Cp = MSEs + 2*sigma2.hat*DoFs/n
return(Cp)
}
perfume = subset(perfume, select = -c(new_price, base_note, middle_note))
lm.1 = lm(old_price ~ ., data = perfume)
summary(lm.1)
##
## Call:
## lm(formula = old_price ~ ., data = perfume)
##
## Residuals:
## Min 1Q Median 3Q Max
## -377.71 -147.39 -16.41 115.27 1441.45
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.634e+01 1.671e+02 0.397 0.691464
## ml 1.279e+00 3.548e-01 3.606 0.000331 ***
## item_rating 3.702e+00 1.412e+01 0.262 0.793250
## seller_rating 5.571e+01 3.895e+01 1.430 0.153087
## scentFloral -1.275e+01 3.017e+01 -0.423 0.672596
## scentFresh -9.728e+01 4.215e+01 -2.308 0.021281 *
## scentFruity -5.955e+01 3.753e+01 -1.587 0.113008
## scentOriental -5.881e+01 3.575e+01 -1.645 0.100345
## scentSpicy -4.004e+01 3.322e+01 -1.205 0.228429
## scentWoody 9.180e+00 3.024e+01 0.304 0.761549
## concEDT -1.457e+02 1.966e+01 -7.410 3.29e-13 ***
## num_sel_ratings -1.254e-03 9.611e-04 -1.305 0.192323
## genderUnisex -1.179e+02 3.615e+01 -3.261 0.001160 **
## genderWomen -1.588e+01 2.213e+01 -0.718 0.473214
## big_brand 7.978e+01 1.597e+01 4.996 7.22e-07 ***
## is_noon 9.856e+01 9.317e+01 1.058 0.290441
## comp -3.123e+00 2.380e+00 -1.312 0.189851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 216.3 on 781 degrees of freedom
## Multiple R-squared: 0.1371, Adjusted R-squared: 0.1194
## F-statistic: 7.755 on 16 and 781 DF, p-value: < 2.2e-16
# residual analysis
plot(lm.1, which = 4)
dwtest(lm.1, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: lm.1
## DW = 1.916, p-value = 0.2273
## alternative hypothesis: true autocorrelation is not 0
set1 = lm.1$residuals[which(lm.1$fitted.values >= 300)]
set2 = lm.1$residuals[which(lm.1$fitted.values < 300)]
var.test(set1, set2)
##
## F test to compare two variances
##
## data: set1 and set2
## F = 1.39, num df = 499, denom df = 297, p-value = 0.001831
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.131064 1.699080
## sample estimates:
## ratio of variances
## 1.389972
perfume2 = perfume %>%
filter(old_price < 930)
lm.2 = lm(old_price ~ ., data = perfume2)
summary(lm.2)
##
## Call:
## lm(formula = old_price ~ ., data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -388.70 -132.69 -8.52 121.88 616.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.924e+01 1.364e+02 -0.361 0.718195
## ml 1.170e+00 2.878e-01 4.065 5.30e-05 ***
## item_rating 6.780e+00 1.152e+01 0.589 0.556282
## seller_rating 8.123e+01 3.181e+01 2.554 0.010849 *
## scentFloral -1.104e+01 2.451e+01 -0.451 0.652394
## scentFresh -1.164e+02 3.440e+01 -3.383 0.000754 ***
## scentFruity -6.434e+01 3.054e+01 -2.107 0.035429 *
## scentOriental -4.197e+01 2.897e+01 -1.449 0.147821
## scentSpicy -5.380e+01 2.697e+01 -1.995 0.046392 *
## scentWoody -2.651e+01 2.466e+01 -1.075 0.282736
## concEDT -1.201e+02 1.613e+01 -7.444 2.63e-13 ***
## num_sel_ratings -1.084e-03 7.783e-04 -1.393 0.164150
## genderUnisex -1.570e+02 2.980e+01 -5.270 1.78e-07 ***
## genderWomen -2.308e+01 1.820e+01 -1.268 0.205283
## big_brand 8.712e+01 1.306e+01 6.672 4.84e-11 ***
## is_noon 8.543e+01 7.543e+01 1.132 0.257784
## comp -4.513e+00 1.934e+00 -2.333 0.019893 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.1 on 767 degrees of freedom
## Multiple R-squared: 0.1923, Adjusted R-squared: 0.1754
## F-statistic: 11.41 on 16 and 767 DF, p-value: < 2.2e-16
# residual analysis
plot(lm.2)
plot(lm.2, which = 4)
dwtest(lm.2, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: lm.2
## DW = 1.8785, p-value = 0.08476
## alternative hypothesis: true autocorrelation is not 0
set1 = lm.2$residuals[which(lm.2$fitted.values >= 300)]
set2 = lm.2$residuals[which(lm.2$fitted.values < 300)]
var.test(set1, set2)
##
## F test to compare two variances
##
## data: set1 and set2
## F = 1.0427, num df = 451, denom df = 331, p-value = 0.6874
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.8515493 1.2727029
## sample estimates:
## ratio of variances
## 1.042677
# remove is_noon?
lm.3 = lm(old_price ~ big_brand + comp + item_rating +
conc + ml + num_sel_ratings +
gender + seller_rating + scent, data = perfume2)
summary(lm.3)
##
## Call:
## lm(formula = old_price ~ big_brand + comp + item_rating + conc +
## ml + num_sel_ratings + gender + seller_rating + scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -388.54 -132.61 -7.18 122.82 617.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.047e+01 1.364e+02 -0.370 0.711535
## big_brand 8.661e+01 1.305e+01 6.635 6.11e-11 ***
## comp -4.504e+00 1.935e+00 -2.328 0.020175 *
## item_rating 6.865e+00 1.152e+01 0.596 0.551454
## concEDT -1.212e+02 1.610e+01 -7.527 1.46e-13 ***
## ml 1.164e+00 2.878e-01 4.043 5.81e-05 ***
## num_sel_ratings -2.269e-04 1.822e-04 -1.245 0.213372
## genderUnisex -1.579e+02 2.980e+01 -5.298 1.53e-07 ***
## genderWomen -2.304e+01 1.821e+01 -1.265 0.206096
## seller_rating 8.164e+01 3.181e+01 2.566 0.010466 *
## scentFloral -1.172e+01 2.451e+01 -0.478 0.632632
## scentFresh -1.157e+02 3.440e+01 -3.365 0.000805 ***
## scentFruity -6.415e+01 3.054e+01 -2.101 0.036006 *
## scentOriental -4.053e+01 2.895e+01 -1.400 0.161917
## scentSpicy -5.251e+01 2.695e+01 -1.949 0.051710 .
## scentWoody -2.664e+01 2.467e+01 -1.080 0.280472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.1 on 768 degrees of freedom
## Multiple R-squared: 0.1909, Adjusted R-squared: 0.1751
## F-statistic: 12.08 on 15 and 768 DF, p-value: < 2.2e-16
anova(lm.2, lm.3)
# yes
# remove item_rating?
lm.4 = lm(old_price ~ big_brand + comp +
conc + ml + num_sel_ratings +
gender + seller_rating + scent, data = perfume2)
summary(lm.4)
##
## Call:
## lm(formula = old_price ~ big_brand + comp + conc + ml + num_sel_ratings +
## gender + seller_rating + scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -392.0 -131.7 -5.2 124.8 617.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.279e+01 1.282e+02 -0.178 0.858964
## big_brand 8.682e+01 1.304e+01 6.656 5.33e-11 ***
## comp -4.491e+00 1.934e+00 -2.323 0.020462 *
## concEDT -1.217e+02 1.607e+01 -7.574 1.04e-13 ***
## ml 1.162e+00 2.877e-01 4.040 5.88e-05 ***
## num_sel_ratings -2.291e-04 1.821e-04 -1.258 0.208764
## genderUnisex -1.581e+02 2.978e+01 -5.309 1.44e-07 ***
## genderWomen -2.190e+01 1.810e+01 -1.210 0.226568
## seller_rating 8.252e+01 3.177e+01 2.598 0.009566 **
## scentFloral -1.237e+01 2.447e+01 -0.506 0.613225
## scentFresh -1.165e+02 3.436e+01 -3.390 0.000734 ***
## scentFruity -6.452e+01 3.052e+01 -2.114 0.034848 *
## scentOriental -4.094e+01 2.893e+01 -1.415 0.157398
## scentSpicy -5.217e+01 2.693e+01 -1.937 0.053079 .
## scentWoody -2.661e+01 2.466e+01 -1.079 0.280857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.1 on 769 degrees of freedom
## Multiple R-squared: 0.1906, Adjusted R-squared: 0.1758
## F-statistic: 12.93 on 14 and 769 DF, p-value: < 2.2e-16
anova(lm.3, lm.4)
# yes
# remove num_sel_ratings?
lm.5 = lm(old_price ~ big_brand + comp +
conc + ml +
gender + seller_rating + scent, data = perfume2)
summary(lm.5)
##
## Call:
## lm(formula = old_price ~ big_brand + comp + conc + ml + gender +
## seller_rating + scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -401.66 -131.55 -6.68 118.22 620.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.1517 107.0097 0.618 0.536637
## big_brand 85.9311 13.0284 6.596 7.87e-11 ***
## comp -4.4495 1.9342 -2.300 0.021694 *
## concEDT -121.6950 16.0785 -7.569 1.08e-13 ***
## ml 1.1437 0.2874 3.979 7.57e-05 ***
## genderUnisex -159.7660 29.7651 -5.368 1.06e-07 ***
## genderWomen -21.7679 18.1052 -1.202 0.229617
## seller_rating 58.8679 25.6163 2.298 0.021826 *
## scentFloral -12.3393 24.4797 -0.504 0.614361
## scentFresh -115.8323 34.3721 -3.370 0.000789 ***
## scentFruity -64.5532 30.5335 -2.114 0.034821 *
## scentOriental -40.8892 28.9379 -1.413 0.158060
## scentSpicy -51.5216 26.9361 -1.913 0.056153 .
## scentWoody -26.3411 24.6668 -1.068 0.285910
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.1 on 770 degrees of freedom
## Multiple R-squared: 0.1889, Adjusted R-squared: 0.1752
## F-statistic: 13.79 on 13 and 770 DF, p-value: < 2.2e-16
anova(lm.4, lm.5)
# yes
# remove seller_rating?
lm.6 = lm(old_price ~ big_brand + comp +
conc + ml +
gender + scent, data = perfume2)
summary(lm.6)
##
## Call:
## lm(formula = old_price ~ big_brand + comp + conc + ml + gender +
## scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -390.30 -132.39 -3.85 126.65 613.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 293.1350 41.2872 7.100 2.84e-12 ***
## big_brand 88.5349 13.0150 6.803 2.06e-11 ***
## comp -4.6393 1.9378 -2.394 0.016900 *
## concEDT -123.0394 16.1124 -7.636 6.62e-14 ***
## ml 1.1934 0.2874 4.153 3.65e-05 ***
## genderUnisex -158.8406 29.8449 -5.322 1.35e-07 ***
## genderWomen -19.5736 18.1301 -1.080 0.280649
## scentFloral -11.9415 24.5470 -0.486 0.626767
## scentFresh -118.3690 34.4496 -3.436 0.000622 ***
## scentFruity -63.7574 30.6162 -2.082 0.037628 *
## scentOriental -38.2330 28.9949 -1.319 0.187692
## scentSpicy -52.7289 27.0056 -1.953 0.051239 .
## scentWoody -27.4623 24.7303 -1.110 0.267142
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.6 on 771 degrees of freedom
## Multiple R-squared: 0.1833, Adjusted R-squared: 0.1706
## F-statistic: 14.42 on 12 and 771 DF, p-value: < 2.2e-16
anova(lm.5, lm.6)
# no
# remove comp?
lm.7 = lm(old_price ~ big_brand +
conc + ml + seller_rating +
gender + scent, data = perfume2)
summary(lm.7)
##
## Call:
## lm(formula = old_price ~ big_brand + conc + ml + seller_rating +
## gender + scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -402.45 -134.17 -2.55 120.68 641.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.0020 106.1437 0.283 0.77752
## big_brand 90.4464 12.9155 7.003 5.46e-12 ***
## concEDT -124.4426 16.0786 -7.740 3.13e-14 ***
## ml 1.0706 0.2864 3.738 0.00020 ***
## seller_rating 61.3847 25.6641 2.392 0.01700 *
## genderUnisex -158.7315 29.8444 -5.319 1.37e-07 ***
## genderWomen -22.2882 18.1541 -1.228 0.21993
## scentFloral -10.8546 24.5392 -0.442 0.65837
## scentFresh -113.6252 34.4542 -3.298 0.00102 **
## scentFruity -62.4752 30.6050 -2.041 0.04156 *
## scentOriental -42.8740 29.0054 -1.478 0.13978
## scentSpicy -49.8770 27.0014 -1.847 0.06510 .
## scentWoody -24.4305 24.7213 -0.988 0.32335
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.6 on 771 degrees of freedom
## Multiple R-squared: 0.1833, Adjusted R-squared: 0.1706
## F-statistic: 14.42 on 12 and 771 DF, p-value: < 2.2e-16
anova(lm.5, lm.7)
# We cannot.
# remove gender?
lm.8 = lm(old_price ~ big_brand +
conc + ml + comp + seller_rating +
scent, data = perfume2)
summary(lm.8)
##
## Call:
## lm(formula = old_price ~ big_brand + conc + ml + comp + seller_rating +
## scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -360.11 -132.84 -3.47 127.08 635.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.9219 108.2072 0.304 0.76102
## big_brand 86.3603 13.2545 6.516 1.31e-10 ***
## concEDT -102.9945 14.6992 -7.007 5.32e-12 ***
## ml 1.1643 0.2885 4.036 5.99e-05 ***
## comp -4.2661 1.9676 -2.168 0.03045 *
## seller_rating 57.9550 26.0292 2.227 0.02627 *
## scentFloral -7.8200 24.3445 -0.321 0.74813
## scentFresh -108.3212 34.8832 -3.105 0.00197 **
## scentFruity -54.0073 30.7669 -1.755 0.07959 .
## scentOriental -49.1566 29.4035 -1.672 0.09497 .
## scentSpicy -44.6627 27.1583 -1.645 0.10047
## scentWoody -21.9729 24.9056 -0.882 0.37792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 178.2 on 772 degrees of freedom
## Multiple R-squared: 0.158, Adjusted R-squared: 0.146
## F-statistic: 13.17 on 11 and 772 DF, p-value: < 2.2e-16
anova(lm.5, lm.8)
# We cannot.
# remove concentration?
lm.9 = lm(old_price ~ big_brand + gender + comp + seller_rating +
ml + scent, data = perfume2)
summary(lm.9)
##
## Call:
## lm(formula = old_price ~ big_brand + gender + comp + seller_rating +
## ml + scent, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -381.77 -132.91 -10.13 127.95 675.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -45.8480 109.7821 -0.418 0.67634
## big_brand 84.8390 13.4948 6.287 5.42e-10 ***
## genderUnisex -100.3190 29.7397 -3.373 0.00078 ***
## genderWomen 35.9366 17.0106 2.113 0.03496 *
## comp -5.5370 1.9981 -2.771 0.00572 **
## seller_rating 65.9226 26.5174 2.486 0.01313 *
## ml 0.8317 0.2946 2.823 0.00488 **
## scentFloral 18.2723 25.0091 0.731 0.46523
## scentFresh -85.8633 35.3676 -2.428 0.01542 *
## scentFruity -30.8914 31.2911 -0.987 0.32384
## scentOriental 5.0601 29.3085 0.173 0.86297
## scentSpicy -34.3525 27.8029 -1.236 0.21699
## scentWoody -2.0542 25.3342 -0.081 0.93539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 181.4 on 771 degrees of freedom
## Multiple R-squared: 0.1285, Adjusted R-squared: 0.115
## F-statistic: 9.477 on 12 and 771 DF, p-value: < 2.2e-16
anova(lm.5, lm.9)
# No.
# remove scent?
lm.10 = lm(old_price ~ big_brand + gender + ml + conc + seller_rating + comp, data = perfume2)
summary(lm.10)
##
## Call:
## lm(formula = old_price ~ big_brand + gender + ml + conc + seller_rating +
## comp, data = perfume2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -401.68 -132.00 -7.78 117.95 657.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.3073 105.0236 0.127 0.8992
## big_brand 85.8179 13.0461 6.578 8.76e-11 ***
## genderUnisex -151.7611 29.6931 -5.111 4.04e-07 ***
## genderWomen -8.4502 16.2998 -0.518 0.6043
## ml 1.1538 0.2877 4.011 6.64e-05 ***
## concEDT -116.2215 15.7987 -7.356 4.81e-13 ***
## seller_rating 61.2999 25.7333 2.382 0.0175 *
## comp -4.3263 1.9427 -2.227 0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 176.5 on 776 degrees of freedom
## Multiple R-squared: 0.1697, Adjusted R-squared: 0.1622
## F-statistic: 22.66 on 7 and 776 DF, p-value: < 2.2e-16
anova(lm.5, lm.10)
# No.
# RSE
rses = c(rse(lm.1), rse(lm.2), rse(lm.3), rse(lm.4), rse(lm.5))
# R^2
r2s = c(r2(lm.1), r2(lm.2), r2(lm.3), r2(lm.4), r2(lm.5))
# MSE
mses = c(mse(lm.1), mse(lm.2), mse(lm.3), mse(lm.4), mse(lm.5))
# generalization error
ges = c(ge(lm.1), ge(lm.2), ge(lm.3), ge(lm.4), ge(lm.5))
# Marlow's Cp
Cps = Cp.lm(list(lm.1, lm.2, lm.3, lm.4, lm.5))
# AIC
aics = AIC(lm.1, lm.2, lm.3, lm.4, lm.5)[, 2]
## Warning in AIC.default(lm.1, lm.2, lm.3, lm.4, lm.5): models are not all fitted
## to the same number of observations
# BIC
bics = BIC(lm.1, lm.2, lm.3, lm.4, lm.5)[, 2]
## Warning in BIC.default(lm.1, lm.2, lm.3, lm.4, lm.5): models are not all fitted
## to the same number of observations
# cannot remove scent, conc
metrics = data.frame(rses, r2s, mses, ges, Cps, aics, bics); metrics
## 5-fold CV
pre.ols = rep(0, nrow(perfume2))
pre.best = rep(0, nrow(perfume2))
folds = 5
sb = round(seq(0, nrow(perfume2), length = (folds + 1)))
for (i in 1:folds) {
test = (sb[((folds + 1) - i)] + 1):(sb[((folds + 2) - i)])
train = (1:nrow(perfume2))[-test]
## fit models
fit.ols = lm(old_price ~ ., data = perfume2[train, ])
fit.best = lm(old_price ~ big_brand + comp + conc + ml +
gender + seller_rating + scent, data = perfume2[train, ])
## create predictions
pre.ols[test] = predict(fit.ols, newdata = perfume2[test, ])
pre.best[test] = predict(fit.best, newdata = perfume2[test, ])
}
## Finally, compute the mean squared prediction error:
mean((perfume2$old_price - pre.ols) ^ 2)
## [1] 32890.85
mean((perfume2$old_price - pre.best) ^ 2)
## [1] 31796.32
price_by_scent = perfume2 %>%
group_by(scent) %>%
summarise(avg_price = mean(old_price))
count_by_scent = perfume2 %>%
group_by(scent) %>%
summarise(count = n())
scent_df = data.frame(price_by_scent, count_by_scent[, 2]); scent_df
perfume3 = perfume2 %>%
mutate(is.fresh = ifelse(scent == "Fresh", 1, 0)) %>%
mutate(is.unisex = ifelse(gender == "Unisex", 1, 0))
lm.11 = lm(old_price ~ big_brand + conc + comp + seller_rating + ml + is.unisex + is.fresh, data = perfume3)
summary(lm.11)
##
## Call:
## lm(formula = old_price ~ big_brand + conc + comp + seller_rating +
## ml + is.unisex + is.fresh, data = perfume3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -397.8 -130.0 -8.3 117.3 651.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.3471 104.0130 0.224 0.82246
## big_brand 85.5629 12.9732 6.595 7.84e-11 ***
## concEDT -111.0210 13.5369 -8.201 9.83e-16 ***
## comp -4.4408 1.9320 -2.298 0.02180 *
## seller_rating 56.8586 25.5389 2.226 0.02628 *
## ml 1.2119 0.2797 4.332 1.67e-05 ***
## is.unisex -146.3030 27.4428 -5.331 1.28e-07 ***
## is.fresh -85.3706 28.5594 -2.989 0.00289 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 175.5 on 776 degrees of freedom
## Multiple R-squared: 0.1789, Adjusted R-squared: 0.1714
## F-statistic: 24.15 on 7 and 776 DF, p-value: < 2.2e-16
# residual analysis
plot(lm.11)
plot(lm.11, which = 4)
dwtest(lm.11, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: lm.11
## DW = 1.8654, p-value = 0.0573
## alternative hypothesis: true autocorrelation is not 0
set1 = lm.11$residuals[which(lm.11$fitted.values >= 300)]
set2 = lm.11$residuals[which(lm.11$fitted.values < 300)]
var.test(set1, set2)
##
## F test to compare two variances
##
## data: set1 and set2
## F = 0.94906, num df = 473, denom df = 309, p-value = 0.6078
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7729524 1.1602054
## sample estimates:
## ratio of variances
## 0.9490633
lm.12 = lm(old_price ~ ., data = perfume3)
lm.13 = lm(old_price ~ big_brand + comp + item_rating +
conc + ml + num_sel_ratings +
is.unisex + seller_rating + is.fresh, data = perfume3)
lm.14 = lm(old_price ~ big_brand + comp +
conc + ml + num_sel_ratings +
is.unisex + seller_rating + is.fresh, data = perfume3)
lm.15 = lm(old_price ~ big_brand + comp +
conc + ml + is.unisex + seller_rating + is.fresh, data = perfume3)
# RSE
rses2 = c(rse(lm.12), rse(lm.13), rse(lm.14), rse(lm.15))
# R^2
r2s2 = c(r2(lm.12), r2(lm.13), r2(lm.14), r2(lm.15))
# MSE
mses2 = c(mse(lm.12), mse(lm.13), mse(lm.14), mse(lm.15))
# generalization error
ges2 = c(ge(lm.12), ge(lm.13), ge(lm.14), ge(lm.15))
# Marlow's Cp
Cps2 = Cp.lm(list(lm.12, lm.13, lm.14, lm.15))
# AIC
aics2 = AIC(lm.12, lm.13, lm.14, lm.15)[, 2]
# BIC
bics2 = BIC(lm.12, lm.13, lm.14, lm.15)[, 2]
# cannot remove scent, conc
metrics = data.frame(rses2, r2s2, mses2, ges2, Cps2, aics2, bics2); metrics
palette(brewer.pal(n = 12, name = "Paired"))
# Department
p7 %>%
group_by(department) %>%
summarise(count = n())
dept_slice <- c(379, 50, 461)
lbls <- c("Men", "Unisex", "Women")
pie(dept_slice, labels = lbls, main="Pie Chart of Departments", col = c("#A6CEE3", "#ffffff", "#CAB2D6"))
# Brands
p3 %>%
group_by(brands) %>%
summarise(count = n())
brand_slice <- perfume %>%
group_by(big_brand) %>%
summarise(count = n())
pie(brand_slice$count, labels = c("Niche", "Big Brands"), main="Pie Chart of Brand", col = c("#CAB2D6", "#ffffff"))
# Scent
scent_count <- perfume %>%
group_by(scent) %>%
summarise(count = n())
pie(scent_count$count, labels = scent_count$scent, main="Pie Chart of Scents", col = c("#A6CEE3", "#ffffff", "#CAB2D6", "#FFFF99", "#FDBF6F", "#FB9A99", "#B2DF8A"))
scent_slice <- perfume3 %>%
group_by(is.fresh) %>%
summarise(count = n())
pie(scent_slice$count, labels = c("Others", "Fresh"), main="Pie Chart of Merged Scents", col = c("#A6CEE3", "#ffffff"))
# Price
hist(perfume$old_price, main = "Histogram of Price", xlab = "Price", col = "#A6CEE3", breaks = 20)
# score
hist(perfume$comp, main = "Histogram of Score", xlab = "Score", col = "#CAB2D6")
# Seller
p1 %>%
group_by(seller) %>%
summarise(count = n())
seller_slice <- perfume %>%
group_by(is_noon) %>%
summarise(count = n())
pie(seller_slice$count, labels = c("Non-noon", "Noon"), main="Pie Chart of Seller", col = c("#A6CEE3", "#ffffff"))
# volume
p1 %>%
group_by(ml) %>%
summarise(count = n())
ml_count <- perfume %>%
group_by(ml) %>%
summarise(count = n())
barplot(ml_count$count, names.arg = ml_count$ml, main = "Barplot for Volume", xlab = "Volume", col = "#A6CEE3")
# conc
conc_slice <- perfume %>%
group_by(conc) %>%
summarise(count = n())
pie(conc_slice$count, labels = conc_slice$conc, main="Pie Chart of Concentration", col = c("#CAB2D6", "#A6CEE3"))
# Item rating
hist(perfume$item_rating, col = "#A6CEE3", main = "Histogram of Item Rating", xlab = "Item Rating")
# seller rating
hist(perfume$seller_rating, col = "#CAB2D6", main = "Histogram of Seller Rating", xlab = "Seller Rating")
# num seller rating
hist(perfume$num_sel_ratings, col = "#A6CEE3", main = "Histogram of Num Seller Rating", xlab = "Num Seller Rating")
# Comp
hist(perfume$comp, breaks = 20, main = "Histogram of Complexity", xlab = "Complexity", col = "#CAB2D6")
as.data.frame(base_note) %>%
group_by(base_note) %>%
summarise(count = n())